Here using the protein-coding gene subset from previous data processing to conduct differential expression analysis, and also build the main pipeline.

Load data

dir <- "/home/ubuntu/storage/tcga"
pcg_txi <- readRDS(file.path(dir,"txi.split/pcg.txi.rds"),refhook = NULL)
new_cdr <- read.csv(file.path(dir,"clinic/new.filt.cdr.csv"), row.names = 1)
#sample_info <- read.delim(file.path(dir,"clinic/sample.info.tsv"), header=T)
cfa <- pcg_txi$countsFromAbundance

Fitler out samples with large proportion of lowly-expressed genes

Here 5 samples which had extremely low library size were filted out, so the sample size would go down from 6125 to 6120.

counts <- sort(colSums(cfa))
counts[1:10]
## TCGA-YG-AA3N-01A-11R-A38C-07 TCGA-EB-A97M-01A-11R-A38C-07 
##                     245766.7                     314400.9 
## TCGA-ER-A2NB-01A-12R-A18S-07 TCGA-BF-AAOX-01A-11R-A39D-07 
##                     328614.4                     398275.3 
## TCGA-XV-A9VZ-01A-11R-A38C-07 TCGA-BL-A13I-01A-11R-A277-07 
##                     471810.6                    2922073.8 
## TCGA-56-A49D-01A-11R-A24H-07 TCGA-CJ-4642-01B-01R-1305-07 
##                    3502730.1                    6990537.2 
## TCGA-14-0871-01A-01R-1849-01 TCGA-44-2665-01B-06R-A277-07 
##                    8030096.4                    8060743.1
rm_id <- names(counts[1:5])
cfa <- as.data.frame(cfa)[, -which(colnames(cfa) %in% rm_id)]
cfa <- data.matrix(cfa)
new_cdr <- new_cdr[-which(rownames(new_cdr) %in% rm_id),]
dim(cfa)
## [1] 20449  6120
dim(new_cdr)
## [1] 6120   35

Remove lowly-expressed genes

# Create dge object
dge <- DGEList(cfa)

# Filtering - remove rows that consistently have zero or very low counts
keep <- filterByExpr(dge)
table(keep) # remove 4873 genes
## keep
## FALSE  TRUE 
##  4870 15579
dge_filt <- dge[keep, ]

# Draw density plot
lcpm <- cpm(dge, log=T)
lcpm_f <- cpm(dge_filt, log=T)
nsamples <- ncol(dge)
dp_col <- get_colVector(nsamples)
par(mfrow=c(1,2), las=1) 
draw_densityPlot(lcpm, dp_col, nsamples)
## integer(0)
draw_densityPlot(lcpm_f, dp_col, nsamples)

## integer(0)

Normalize

# Apply TMM normalization method (optional...)
dge_filt <- calcNormFactors(dge_filt)

# Draw boxplots to see the effect, using 20 samples.
col_box <- get_colVector(20)
boxplot(lcpm_f[,1:20], las=2, col=col_box, main="log-CPM before normalization")

lcpm_f <- cpm(dge_filt, log=T)
boxplot(lcpm_f[,1:20], las=2, col=col_box, main="log-CPM after TMM normalization")

Laod processed gene annotation file

fData <- read.delim(file.path(dir, "annotation/fData.txt"), header = T, row.names = 1)
geneid <- rownames(dge_filt$counts)
anno <- fData[geneid,]
gl <- as.vector(anno$geneNames)

Apply limma-voom pipeline

1. one factor
meta <- "Metastasisyes"
design1 <- model.matrix(~ Metastasis, data = new_cdr)
head(design1, 5)
##                              (Intercept) Metastasisyes
## TCGA-2Z-A9J3-01A-12R-A38C-07           1             0
## TCGA-EJ-7784-01A-11R-2118-07           1             0
## TCGA-MJ-A850-01A-11R-A355-07           1             0
## TCGA-ZG-A9LM-01A-11R-A41O-07           1             0
## TCGA-BW-A5NQ-01A-11R-A27V-07           1             0

The first coefficient estimates the mean log-expression for non-metastasized samples and play the role of an intercept. The second one estimates the difference between meta v.s. non-meta. This approach is equal to (~ 0 + Metastasis) then use contrast matrix [confirmed].

v1 <- voom(dge_filt, design1, plot = T) 

fit1 <- lmFit(v1, design1)
fit1 <- eBayes(fit1)
tt1 <- topTable(fit1, number=Inf, genelist=gl, coef=meta)
head(tt1, 20)
##                     ID     logFC  AveExpr        t      P.Value
## ENSG00000138778  CENPE 1.1298105 3.008218 16.14582 1.844244e-57
## ENSG00000163993  S100P 2.3508990 2.053039 16.09008 4.380619e-57
## ENSG00000066279   ASPM 1.3395268 3.473725 16.01242 1.455696e-56
## ENSG00000126787 DLGAP5 1.3016569 2.879460 15.81695 2.920507e-55
## ENSG00000178999  AURKB 1.2331805 2.920284 15.80744 3.376327e-55
## ENSG00000112984 KIF20A 1.1832786 3.512663 15.71442 1.389282e-54
## ENSG00000186185 KIF18B 1.1530582 3.072270 15.71481 1.381099e-54
## ENSG00000183856 IQGAP3 1.2645420 3.905030 15.69480 1.870390e-54
## ENSG00000156970  BUB1B 0.9645109 3.413031 15.59714 8.176882e-54
## ENSG00000143228   NUF2 1.1746111 2.642412 15.58172 1.031356e-53
## ENSG00000142945  KIF2C 1.1744055 3.689352 15.39073 1.797319e-52
## ENSG00000175063  UBE2C 1.4345441 4.016835 15.34495 3.548526e-52
## ENSG00000138180  CEP55 1.2631996 3.326489 15.34041 3.795937e-52
## ENSG00000024526 DEPDC1 1.2634682 2.397007 15.16748 4.871793e-51
## ENSG00000075218  GTSE1 0.9520425 2.940667 15.11150 1.106663e-50
## ENSG00000169679   BUB1 1.1069413 3.600840 15.08936 1.529573e-50
## ENSG00000129173   E2F8 1.2971572 1.624598 15.01373 4.606822e-50
## ENSG00000166851   PLK1 1.1938695 3.994129 14.94652 1.221942e-49
## ENSG00000072571   HMMR 1.0366860 2.869561 14.94484 1.252001e-49
## ENSG00000171241 SHCBP1 0.9357017 2.749364 14.92614 1.641161e-49
##                    adj.P.Val        B
## ENSG00000138778 2.873147e-53 119.7610
## ENSG00000163993 3.412283e-53 118.9046
## ENSG00000066279 7.559429e-53 117.7372
## ENSG00000126787 1.051996e-51 114.7435
## ENSG00000178999 1.051996e-51 114.5985
## ENSG00000112984 3.091947e-51 113.2152
## ENSG00000186185 3.091947e-51 113.2077
## ENSG00000183856 3.642350e-51 112.9203
## ENSG00000156970 1.415418e-50 111.4529
## ENSG00000143228 1.606749e-50 111.1865
## ENSG00000142945 2.545494e-49 108.3932
## ENSG00000175063 4.548992e-49 107.7130
## ENSG00000138180 4.548992e-49 107.6514
## ENSG00000024526 5.421262e-48 105.0740
## ENSG00000075218 1.149381e-47 104.2878
## ENSG00000169679 1.489326e-47 103.9872
## ENSG00000129173 4.221746e-47 102.7233
## ENSG00000166851 1.026575e-46 101.9209
## ENSG00000072571 1.026575e-46 101.8864
## ENSG00000171241 1.278382e-46 101.6052
volp1 <- volcanoplot(fit1, coef=meta, highlight = 4, names=gl)

map1 <- plotMA(fit1, main = "did not correct for cancer type")

summary(decideTests(fit1)) # 5309 down; 4652 up
##        (Intercept) Metastasisyes
## Down          1181          4843
## NotSig          95          6105
## Up           14303          4631
draw_SigCounts(tt1)

plot(density(tt1$logFC), col="skyblue", lwd=3, main="logFC density (design 1)",xlab="logFC")

  • Acutally the limma-voom results were similar after removing some samples with extremely low library size, while the limma-trend results were slightly better.

Swarm plots for top genes in each cancer type

Wilcoxon rank sum test for comparison in each tissue
geneList <- rownames(tt1[1:3,])
draw_boxplot(dge_filt$counts,tt1,new_cdr,geneList[1])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

draw_boxplot(dge_filt$counts,tt1,new_cdr,geneList[2])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

draw_boxplot(dge_filt$counts,tt1,new_cdr,geneList[3])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

Draw the heatmap for the top 50 DEGs from limma-voom results of metastasis v.s. non-metastasis comparison only.

Try annotation:
  1. cancer type;
  2. ajcc_pathologic_tumor_stage (this reflects the stage at initial pathologic diagnosis)
select_gene <- rownames(tt1[(tt1$adj.P.Val < 0.05 & abs(tt1$logFC) > log2(2)),])

h1 <- draw_heatmap(v1, tt1, new_cdr, anno, select_gene[1:50])


2. two factor model
design2 <- model.matrix(~ type + Metastasis, data = new_cdr)
head(design2,3)
##                              (Intercept) typeBRCA typeCESC typeCHOL
## TCGA-2Z-A9J3-01A-12R-A38C-07           1        0        0        0
## TCGA-EJ-7784-01A-11R-2118-07           1        0        0        0
## TCGA-MJ-A850-01A-11R-A355-07           1        0        0        0
##                              typeCOAD typeESCA typeGBM typeHNSC typeKICH
## TCGA-2Z-A9J3-01A-12R-A38C-07        0        0       0        0        0
## TCGA-EJ-7784-01A-11R-2118-07        0        0       0        0        0
## TCGA-MJ-A850-01A-11R-A355-07        0        0       0        0        0
##                              typeKIRC typeKIRP typeLGG typeLIHC typeLUAD
## TCGA-2Z-A9J3-01A-12R-A38C-07        0        1       0        0        0
## TCGA-EJ-7784-01A-11R-2118-07        0        0       0        0        0
## TCGA-MJ-A850-01A-11R-A355-07        0        0       0        0        0
##                              typeLUSC typeMESO typeOV typePAAD typePCPG
## TCGA-2Z-A9J3-01A-12R-A38C-07        0        0      0        0        0
## TCGA-EJ-7784-01A-11R-2118-07        0        0      0        0        0
## TCGA-MJ-A850-01A-11R-A355-07        0        0      0        0        0
##                              typePRAD typeREAD typeSARC typeSKCM typeSTAD
## TCGA-2Z-A9J3-01A-12R-A38C-07        0        0        0        0        0
## TCGA-EJ-7784-01A-11R-2118-07        1        0        0        0        0
## TCGA-MJ-A850-01A-11R-A355-07        0        0        1        0        0
##                              typeTGCT typeTHCA typeTHYM typeUCEC typeUCS
## TCGA-2Z-A9J3-01A-12R-A38C-07        0        0        0        0       0
## TCGA-EJ-7784-01A-11R-2118-07        0        0        0        0       0
## TCGA-MJ-A850-01A-11R-A355-07        0        0        0        0       0
##                              Metastasisyes
## TCGA-2Z-A9J3-01A-12R-A38C-07             0
## TCGA-EJ-7784-01A-11R-2118-07             0
## TCGA-MJ-A850-01A-11R-A355-07             0
v2 <- voom(dge_filt, design2, plot = T)

fit2 <- lmFit(v2, design2)
fit2 <- eBayes(fit2)
tt2 <- topTable(fit2, number=Inf,genelist = gl, coef=meta)
head(tt2, 20)
##                      ID      logFC     AveExpr         t      P.Value
## ENSG00000159217 IGF2BP1  0.3483215  0.84838389  5.697570 1.272091e-08
## ENSG00000170689   HOXB9  0.6554478 -0.11063298  5.555780 2.880689e-08
## ENSG00000040275   SPDL1  0.1630758  3.41878236  5.460150 4.945289e-08
## ENSG00000121552    CSTA -0.5232659  2.36487379 -5.365425 8.373709e-08
## ENSG00000115525 ST3GAL5 -0.2853079  4.03356596 -5.222934 1.819527e-07
## ENSG00000163815  CLEC3B -0.4002104  2.81816410 -4.864235 1.177866e-06
## ENSG00000160949   TONSL  0.1888062  4.24421099  4.794452 1.669946e-06
## ENSG00000117724   CENPF  0.2398036  5.07582928  4.755581 2.024301e-06
## ENSG00000050405   LIMA1 -0.1872162  6.63859912 -4.693093 2.749874e-06
## ENSG00000183145 RIPPLY3  0.3281734  0.04946417  4.753670 2.043468e-06
## ENSG00000160957  RECQL4  0.2257792  4.20401436  4.658038 3.260159e-06
## ENSG00000088305  DNMT3B  0.2390571  2.35754636  4.602601 4.257072e-06
## ENSG00000198901    PRC1  0.1902078  4.93012754  4.579998 4.742306e-06
## ENSG00000156970   BUB1B  0.1905644  3.41303079  4.551102 5.440164e-06
## ENSG00000183347    GBP6 -0.4282269  0.28070807 -4.583062 4.673547e-06
## ENSG00000112984  KIF20A  0.2216134  3.51266330  4.475419 7.764904e-06
## ENSG00000179163   FUCA1 -0.1765665  5.75068659 -4.476753 7.716744e-06
## ENSG00000183150   GPR19  0.2916359 -0.22194358  4.558829 5.244458e-06
## ENSG00000139880   CDH24  0.1930271  3.32901648  4.444816 8.952587e-06
## ENSG00000198768 APCDD1L  0.5021947 -0.80632787  4.499216 6.947104e-06
##                    adj.P.Val        B
## ENSG00000159217 0.0001981790 8.606308
## ENSG00000170689 0.0002243913 8.096309
## ENSG00000040275 0.0002568089 8.028858
## ENSG00000121552 0.0003261350 7.480696
## ENSG00000115525 0.0005669282 6.822465
## ENSG00000163815 0.0030583294 4.989670
## ENSG00000160949 0.0035372434 4.807785
## ENSG00000117724 0.0035372434 4.615470
## ENSG00000050405 0.0042840291 4.322353
## ENSG00000183145 0.0035372434 4.234120
## ENSG00000160957 0.0046172748 4.190014
## ENSG00000088305 0.0052771703 3.906270
## ENSG00000198901 0.0052771703 3.829545
## ENSG00000156970 0.0052970200 3.728153
## ENSG00000183347 0.0052771703 3.697699
## ENSG00000112984 0.0060743694 3.396772
## ENSG00000179163 0.0060743694 3.377292
## ENSG00000183150 0.0052970200 3.349265
## ENSG00000139880 0.0066415405 3.275995
## ENSG00000198768 0.0060743694 3.159289
volp2 <- volcanoplot(fit2, coef=meta, highlight = 6, names=gl)

map2 <- plotMA(fit2, main = "correct for cancer type")

draw_SigCounts(tt2)

plot(density(tt2$logFC), col="steelblue", lwd=3, main="logFC density (design 2)",xlab="logFC")

geneList2 <- rownames(tt2[1:3,])
draw_boxplot(dge_filt$counts,tt2,new_cdr,geneList2[1])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

draw_boxplot(dge_filt$counts,tt2,new_cdr,geneList2[2])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

draw_boxplot(dge_filt$counts,tt2,new_cdr,geneList2[3])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default

select_gene2 <- rownames(tt2[(tt2$adj.P.Val < 0.05 & abs(tt2$logFC) > log2(1.2)),])
h2 <- draw_heatmap(v2, tt2, new_cdr, anno, select_gene2)

Overlapped genes in the results of design 1 & 2

gene_d1 <- rownames(tt1[tt1$adj.P.Val < 0.05,])
gene_d2 <- rownames(tt2[tt2$adj.P.Val < 0.05,])
ven1 <- venn(list(design1=gene_d1, design2=gene_d2), intersections = T)

isect <- attr(ven1, "intersection")
d2_only <- isect$design2
anno[d2_only,]$geneNames
##  [1] LIMA1    CLP1     KLF2     PIANP    APOBEC3A CCDC69   RAB38   
##  [8] IL1B     LACTB    DENND2C  RNFT2    CDC42    GTF2B    PFN2    
## [15] SCN8A    ENO2     SLCO1A2  PCDHB10  FAM46C   SMPD3    CYP4B1  
## [22] C15orf52 STX11    ZFP41    PHC1     ALOX5AP  UPF3A    ZNF517  
## [29] FAM162B  ZC2HC1A  F13A1    SLC30A3  RHOA     TMEFF1   KIF5A   
## [36] PIK3CG   CLEC10A  KLRD1    MGAT5B   B4GALNT1 CA3      PPP2R2A 
## [43] C1orf21  FZD8     DNAH11   VAMP3    SBK1     ECHDC1   ZNF250  
## [50] SH3BP5L  SPECC1   SRGN     RNF182   GDE1    
## 56638 Levels: 5S_rRNA 7SK A1BG A1BG-AS1 A1CF A2M A2M-AS1 A2ML1 ... ZZZ3
tt1[d2_only,]$adj.P.Val
##  [1] 0.30338066 0.19314615 0.75624681 0.60617291 0.18927227 0.21836588
##  [7] 0.28809582 0.70717509 0.06290863 0.05712739 0.10336526 0.05427154
## [13] 0.58058122 0.57221630 0.44297623 0.83563621 0.67435176 0.77201654
## [19] 0.92883812 0.23529296 0.16387922 0.08044839 0.98393208 0.11715261
## [25] 0.34316408 0.12469296 0.56774195 0.22240135 0.16469080 0.61224165
## [31] 0.92359692 0.09247493 0.15267000 0.76679880 0.16471955 0.11404533
## [37] 0.70448115 0.93016051 0.07557638 0.07940169 0.14947612 0.16328159
## [43] 0.31679835 0.93408102 0.10672770 0.22666582 0.88481008 0.50200915
## [49] 0.46268151 0.41561274 0.12954772 0.15073853 0.05838829 0.70965129
tt2[d2_only,]$adj.P.Val
##  [1] 0.004284029 0.007486101 0.007486101 0.007486101 0.007495182
##  [6] 0.010675393 0.011557523 0.011557523 0.012230752 0.012645254
## [11] 0.013799841 0.013211830 0.013940479 0.013940479 0.013940479
## [16] 0.015462693 0.014556343 0.015756992 0.016977760 0.019050675
## [21] 0.019050675 0.020097219 0.020097219 0.021016541 0.021749304
## [26] 0.023570389 0.022648133 0.024064875 0.023570389 0.025687755
## [31] 0.025687755 0.025631322 0.026712462 0.028316918 0.031403939
## [36] 0.034072174 0.034487562 0.035582388 0.036964470 0.038123402
## [41] 0.036964470 0.036964470 0.036964470 0.038890980 0.038921748
## [46] 0.036964730 0.040674845 0.038123402 0.042913677 0.040026636
## [51] 0.043620728 0.044213687 0.048490775 0.047522907

Save files

w_dir <- "/home/ubuntu/storage/tcga/dea/whole"
write.table(tt1, file=file.path(w_dir,"table/all.types.d1.txt"), sep="\t", quote=F)
pdf(file.path(w_dir,"plot/all.d1.volcanoplot.pdf"))
  volp1
## NULL
dev.off()
## png 
##   2
pdf(file.path(w_dir,"plot/all.d1.maplot.pdf"))
  map1
## NULL
dev.off()
## png 
##   2
write.table(tt2, file=file.path(w_dir,"table/all.types.d2.txt"), sep="\t", quote=F)
pdf(file.path(w_dir,"table/all.d2.volcanoplot.pdf")) 
  volp2
## NULL
dev.off()
## png 
##   2
pdf(file.path(w_dir,"plot/all.d2.maplot.pdf"))
  map2
## NULL
dev.off()
## png 
##   2

limma-trend

If the sequencing depth is reasonably consistent across the RNA samples. This approach will usually work well if the ratio of the largest library size to the smallest is not more than about 3-fold. It’s faster than limma-voom though using more memories.

## library size:
countsSum <- colSums(dge_filt$counts)
max(countsSum) / min(countsSum) # 55.29538 

logCPM <- cpm(dge_filt, log=T, prior.count = 2.5)
fit_lt <- lmFit(logCPM, design1)
fit_lt <- eBayes(fit_lt, trend = T)
tt_lt <- topTable(fit_lt, number=Inf,genelist = gl, coef=meta)
head(tt_lt,20)
volcanoplot(fit_lt, coef=meta, highlight = 4, names=gl)
plotMA(fit_lt, main = "no correct for cancer type")